from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score,f1_score,fbeta_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import seaborn as sns
import pandas as pd
import numpy as np
import sklearn
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
df = pd.read_csv(r'C:\Users\lenovo\Downloads\heart_disease_health_indicators.csv')
df.shape
(253680, 22)
df.head()
| HeartDiseaseorAttack | HighBP | HighChol | CholCheck | BMI | Smoker | Stroke | Diabetes | PhysActivity | Fruits | ... | AnyHealthcare | NoDocbcCost | GenHlth | MentHlth | PhysHlth | DiffWalk | Sex | Age | Education | Income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1.0 | 1.0 | 1.0 | 40.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 5.0 | 18.0 | 15.0 | 1.0 | 0.0 | 9.0 | 4.0 | 3.0 |
| 1 | 0.0 | 0.0 | 0.0 | 0.0 | 25.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 0.0 | 1.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 7.0 | 6.0 | 1.0 |
| 2 | 0.0 | 1.0 | 1.0 | 1.0 | 28.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | ... | 1.0 | 1.0 | 5.0 | 30.0 | 30.0 | 1.0 | 0.0 | 9.0 | 4.0 | 8.0 |
| 3 | 0.0 | 1.0 | 0.0 | 1.0 | 27.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | ... | 1.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 11.0 | 3.0 | 6.0 |
| 4 | 0.0 | 1.0 | 1.0 | 1.0 | 24.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | ... | 1.0 | 0.0 | 2.0 | 3.0 | 0.0 | 0.0 | 0.0 | 11.0 | 5.0 | 4.0 |
5 rows × 22 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 253680 entries, 0 to 253679 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 HeartDiseaseorAttack 253680 non-null float64 1 HighBP 253680 non-null float64 2 HighChol 253680 non-null float64 3 CholCheck 253680 non-null float64 4 BMI 253680 non-null float64 5 Smoker 253680 non-null float64 6 Stroke 253680 non-null float64 7 Diabetes 253680 non-null float64 8 PhysActivity 253680 non-null float64 9 Fruits 253680 non-null float64 10 Veggies 253680 non-null float64 11 HvyAlcoholConsump 253680 non-null float64 12 AnyHealthcare 253680 non-null float64 13 NoDocbcCost 253680 non-null float64 14 GenHlth 253680 non-null float64 15 MentHlth 253680 non-null float64 16 PhysHlth 253680 non-null float64 17 DiffWalk 253680 non-null float64 18 Sex 253680 non-null float64 19 Age 253680 non-null float64 20 Education 253680 non-null float64 21 Income 253680 non-null float64 dtypes: float64(22) memory usage: 42.6 MB
df.describe()
| HeartDiseaseorAttack | HighBP | HighChol | CholCheck | BMI | Smoker | Stroke | Diabetes | PhysActivity | Fruits | ... | AnyHealthcare | NoDocbcCost | GenHlth | MentHlth | PhysHlth | DiffWalk | Sex | Age | Education | Income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | ... | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 |
| mean | 0.094186 | 0.429001 | 0.424121 | 0.962670 | 28.382364 | 0.443169 | 0.040571 | 0.296921 | 0.756544 | 0.634256 | ... | 0.951053 | 0.084177 | 2.511392 | 3.184772 | 4.242081 | 0.168224 | 0.440342 | 8.032119 | 5.050434 | 6.053875 |
| std | 0.292087 | 0.494934 | 0.494210 | 0.189571 | 6.608694 | 0.496761 | 0.197294 | 0.698160 | 0.429169 | 0.481639 | ... | 0.215759 | 0.277654 | 1.068477 | 7.412847 | 8.717951 | 0.374066 | 0.496429 | 3.054220 | 0.985774 | 2.071148 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 12.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
| 25% | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 24.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | ... | 1.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 4.000000 | 5.000000 |
| 50% | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 27.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 5.000000 | 7.000000 |
| 75% | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 31.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 0.000000 | 3.000000 | 2.000000 | 3.000000 | 0.000000 | 1.000000 | 10.000000 | 6.000000 | 8.000000 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 98.000000 | 1.000000 | 1.000000 | 2.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 5.000000 | 30.000000 | 30.000000 | 1.000000 | 1.000000 | 13.000000 | 6.000000 | 8.000000 |
8 rows × 22 columns
df.isnull().sum()
HeartDiseaseorAttack 0 HighBP 0 HighChol 0 CholCheck 0 BMI 0 Smoker 0 Stroke 0 Diabetes 0 PhysActivity 0 Fruits 0 Veggies 0 HvyAlcoholConsump 0 AnyHealthcare 0 NoDocbcCost 0 GenHlth 0 MentHlth 0 PhysHlth 0 DiffWalk 0 Sex 0 Age 0 Education 0 Income 0 dtype: int64
df.duplicated().sum()
23899
df.drop_duplicates(inplace = True)
df.duplicated().sum()
0
df.shape
(229781, 22)
df['Age'] = df['Age']*4
df
| HeartDiseaseorAttack | HighBP | HighChol | CholCheck | BMI | Smoker | Stroke | Diabetes | PhysActivity | Fruits | ... | AnyHealthcare | NoDocbcCost | GenHlth | MentHlth | PhysHlth | DiffWalk | Sex | Age | Education | Income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1.0 | 1.0 | 1.0 | 40.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 5.0 | 18.0 | 15.0 | 1.0 | 0.0 | 36.0 | 4.0 | 3.0 |
| 1 | 0.0 | 0.0 | 0.0 | 0.0 | 25.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 0.0 | 1.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 28.0 | 6.0 | 1.0 |
| 2 | 0.0 | 1.0 | 1.0 | 1.0 | 28.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | ... | 1.0 | 1.0 | 5.0 | 30.0 | 30.0 | 1.0 | 0.0 | 36.0 | 4.0 | 8.0 |
| 3 | 0.0 | 1.0 | 0.0 | 1.0 | 27.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | ... | 1.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 44.0 | 3.0 | 6.0 |
| 4 | 0.0 | 1.0 | 1.0 | 1.0 | 24.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | ... | 1.0 | 0.0 | 2.0 | 3.0 | 0.0 | 0.0 | 0.0 | 44.0 | 5.0 | 4.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 253675 | 0.0 | 1.0 | 1.0 | 1.0 | 45.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | 3.0 | 0.0 | 5.0 | 0.0 | 1.0 | 20.0 | 6.0 | 7.0 |
| 253676 | 0.0 | 1.0 | 1.0 | 1.0 | 18.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 4.0 | 0.0 | 0.0 | 1.0 | 0.0 | 44.0 | 2.0 | 4.0 |
| 253677 | 0.0 | 0.0 | 0.0 | 1.0 | 28.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 8.0 | 5.0 | 2.0 |
| 253678 | 0.0 | 1.0 | 0.0 | 1.0 | 23.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | 3.0 | 0.0 | 0.0 | 0.0 | 1.0 | 28.0 | 5.0 | 1.0 |
| 253679 | 1.0 | 1.0 | 1.0 | 1.0 | 25.0 | 0.0 | 0.0 | 2.0 | 1.0 | 1.0 | ... | 1.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 36.0 | 6.0 | 2.0 |
229781 rows × 22 columns
df['Age'].max()
52.0
df['Age'].min()
4.0
df["HeartDiseaseorAttack"].value_counts()
0.0 206064 1.0 23717 Name: HeartDiseaseorAttack, dtype: int64
df.columns
Index(['HeartDiseaseorAttack', 'HighBP', 'HighChol', 'CholCheck', 'BMI',
'Smoker', 'Stroke', 'Diabetes', 'PhysActivity', 'Fruits', 'Veggies',
'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
'Income'],
dtype='object')
HeartDiseaseorAttack: A binary variable indicating whether the individual has a history of heart disease or heart attack. HighBP: A binary variable indicating whether the individual has high blood pressure. HighChol: A binary variable indicating whether the individual has high cholesterol. CholCheck: A binary variable indicating whether the individual has had their cholesterol checked. BMI: A continuous variable representing the individual's body mass index, which is a measure of body fat based on height and weight. Smoker: A binary variable indicating whether the individual smokes cigarettes or not. Stroke: A binary variable indicating whether the individual has had a stroke. Diabetes: A binary variable indicating whether the individual has diabetes. PhysActivity: A categorical variable indicating the level of physical activity of the individual. Fruits: A continuous variable representing the number of servings of fruits the individual consumes. Veggies: A continuous variable representing the number of servings of vegetables the individual consumes. HvyAlcoholConsump: A binary variable indicating whether the individual has heavy alcohol consumption. AnyHealthcare: A binary variable indicating whether the individual has any healthcare coverage. NoDocbcCost: A binary variable indicating whether the individual does not have any doctor's visits due to cost. GenHlth: A categorical variable indicating the general health status of the individual. MentHlth: A categorical variable indicating the mental health status of the individual. PhysHlth: A continuous variable representing the number of days the individual's physical health was not good. DiffWalk: A binary variable indicating whether the individual has difficulty walking. Sex: A categorical variable indicating the gender of the individual. Age: A continuous variable representing the age of the individual. Education: A categorical variable indicating the highest level of education completed by the individual. Income: A categorical variable indicating the income level of the individual.
print("no. of column = ", len(df.columns))
no. of column = 22
df1 = df.drop([ 'Education', 'Income'], axis=1)
df1
| HeartDiseaseorAttack | HighBP | HighChol | CholCheck | BMI | Smoker | Stroke | Diabetes | PhysActivity | Fruits | Veggies | HvyAlcoholConsump | AnyHealthcare | NoDocbcCost | GenHlth | MentHlth | PhysHlth | DiffWalk | Sex | Age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1.0 | 1.0 | 1.0 | 40.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 5.0 | 18.0 | 15.0 | 1.0 | 0.0 | 36.0 |
| 1 | 0.0 | 0.0 | 0.0 | 0.0 | 25.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 28.0 |
| 2 | 0.0 | 1.0 | 1.0 | 1.0 | 28.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 5.0 | 30.0 | 30.0 | 1.0 | 0.0 | 36.0 |
| 3 | 0.0 | 1.0 | 0.0 | 1.0 | 27.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 44.0 |
| 4 | 0.0 | 1.0 | 1.0 | 1.0 | 24.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 2.0 | 3.0 | 0.0 | 0.0 | 0.0 | 44.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 253675 | 0.0 | 1.0 | 1.0 | 1.0 | 45.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 3.0 | 0.0 | 5.0 | 0.0 | 1.0 | 20.0 |
| 253676 | 0.0 | 1.0 | 1.0 | 1.0 | 18.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 4.0 | 0.0 | 0.0 | 1.0 | 0.0 | 44.0 |
| 253677 | 0.0 | 0.0 | 0.0 | 1.0 | 28.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 8.0 |
| 253678 | 0.0 | 1.0 | 0.0 | 1.0 | 23.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 3.0 | 0.0 | 0.0 | 0.0 | 1.0 | 28.0 |
| 253679 | 1.0 | 1.0 | 1.0 | 1.0 | 25.0 | 0.0 | 0.0 | 2.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 36.0 |
229781 rows × 20 columns
correlation = df1.corr()
#correlation.style.background_gradient(cmap = 'BrBG')
plt.figure(figsize = (8,8))
sns.heatmap(correlation, cbar=True, square=True, fmt='.1f',annot=True, annot_kws={'size':8}, cmap='Purples')
<AxesSubplot:>
#The interquartile range (IQR) is a measure of statistical dispersion,prints the IQR scores, which can be used to detect outliers.
Q1 = df1.quantile(0.25)
Q3 = df1.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
HeartDiseaseorAttack 0.0 HighBP 1.0 HighChol 1.0 CholCheck 0.0 BMI 8.0 Smoker 1.0 Stroke 0.0 Diabetes 0.0 PhysActivity 1.0 Fruits 1.0 Veggies 0.0 HvyAlcoholConsump 0.0 AnyHealthcare 0.0 NoDocbcCost 0.0 GenHlth 1.0 MentHlth 2.0 PhysHlth 4.0 DiffWalk 0.0 Sex 1.0 Age 16.0 dtype: float64
# Create a figure and axis
fig, ax = plt.subplots(figsize=(75, 30))
# Plot the box plots for all columns
df1.boxplot(ax=ax)
plt.ylim(bottom=0, top=2500)
# Show the plot
plt.show()
sns.boxplot(df1['BMI'])
C:\Users\lenovo\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='BMI'>
sns.boxplot(df1['PhysHlth'])
C:\Users\lenovo\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='PhysHlth'>
data = [df1["HeartDiseaseorAttack"], df1["Age"], df1["BMI"], df1["PhysHlth"]]
fig = plt.figure(figsize =(10, 7))
ax = fig.add_subplot(111)
# Creating axes instance
bp = ax.boxplot(data, patch_artist = True,
notch ='True', vert = 0)
colors = ['#0000FF', '#00FF00',
'#FFFF00', '#FF00FF']
for patch, color in zip(bp['boxes'], colors):
patch.set_facecolor(color)
# changing color and linewidth of
# whiskers
for whisker in bp['whiskers']:
whisker.set(color ='#8B008B',
linewidth = 1.5,
linestyle =":")
# changing color and linewidth of
# caps
for cap in bp['caps']:
cap.set(color ='#8B008B',
linewidth = 2)
# changing color and linewidth of
# medians
for median in bp['medians']:
median.set(color ='red',
linewidth = 3)
# changing style of fliers
for flier in bp['fliers']:
flier.set(marker ='D',
color ='#e7298a',
alpha = 0.5)
# x-axis labels
ax.set_yticklabels(['HeartDiseaseorAttack', 'Age',
'BMI', 'PhysHlth'])
# Adding title
plt.title("Customized box plot")
# Removing top axes and right axes
# ticks
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
# show plot
plt.show()
df2 = df1[~((df1 < (Q1 - 1.5 * IQR)) |(df1 > (Q3 + 1.5 * IQR))).any(axis=1)]
print(df2.shape)
(79138, 20)
x=['HeartDiseaseorAttack','Age','HighChol','Diabetes','HighBP', 'Fruits','Smoker']
for i in x :
fig = px.histogram(df1, x=df1[i], nbins=50)
fig.show()
sns.pairplot(df1)
<seaborn.axisgrid.PairGrid at 0x21125850610>
sns.countplot(x='Sex', data=df1, hue='Smoker')
<AxesSubplot:xlabel='Sex', ylabel='count'>
sns.countplot(x='Smoker', data=df1, hue='HeartDiseaseorAttack')
<AxesSubplot:xlabel='Smoker', ylabel='count'>
#Make a countplot on the features to differentiate them into binary, categorical and numerical features
# Countplot on each feature
plt.figure(figsize=(20,60))
for i,column in enumerate(df1.columns):
plt.subplot(len(df1.columns), 5, i+1)
plt.suptitle("Plot Value Count", fontsize=20, x=0.5, y=1)
sns.countplot(data=df1, x=column)
plt.title(f"{column}")
plt.tight_layout()
# Separate into target, binary, categorical and numerical features
target = ['HeartDiseaseorAttack']
bin_features = ['HighBP', 'HighChol', 'CholCheck','Smoker', 'Stroke','PhysActivity', 'Fruits', 'Veggies',
'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex']
cat_features = ['Diabetes', 'GenHlth']
num_features = ['BMI','MentHlth', 'PhysHlth', 'Age']
# Explore distribtuion of binary features with pie charts
plt.figure(figsize=(20,60))
for i,column in enumerate(bin_features):
plt.subplot(len(bin_features), 5, i+1)
plt.pie(x=df1[column].value_counts(), labels=df1[column].unique(), autopct='%.0f%%')
plt.title(f"{column}")
plt.tight_layout()
# Explore distribtuion of numerical features histograms
fig, axes = plt.subplots(2, 2, figsize=(18, 10))
sns.histplot(ax=axes[0,0], data=df1, x=num_features[0])
sns.histplot(ax=axes[0,1], data=df1, x=num_features[1])
sns.histplot(ax=axes[1,0], data=df1, x=num_features[2])
sns.histplot(ax=axes[1,1], data=df1, x=num_features[3])
<AxesSubplot:xlabel='Age', ylabel='Count'>
sns.distplot(df1['Age'], kde=False, color='purple')
C:\Users\lenovo\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='Age'>
sns.distplot(df2['BMI'], kde=False)
C:\Users\lenovo\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='BMI'>
#the relation between Body mass and heart Diseases and if body mass can affect on person with making
#it possible to have heart Diseases or any other Diseases or not
sns.countplot(x='BMI', data=df1, hue='HeartDiseaseorAttack')
<AxesSubplot:xlabel='BMI', ylabel='count'>
sns.countplot(x='BMI', data=df1, hue='Diabetes')
<AxesSubplot:xlabel='BMI', ylabel='count'>
#how healthcare is so important and how it affect on person and make him good more than people who don't have
sns.countplot(x='AnyHealthcare', data=df1, hue='HeartDiseaseorAttack')
<AxesSubplot:xlabel='AnyHealthcare', ylabel='count'>
sns.countplot(x='AnyHealthcare', data=df1, hue='Diabetes')
<AxesSubplot:xlabel='AnyHealthcare', ylabel='count'>
x=df1.drop(['HeartDiseaseorAttack'], axis=1)
y=df1['HeartDiseaseorAttack']
x_train, x_test, y_train, y_test = train_test_split( x , y ,test_size=0.3)
y_train.value_counts()
0.0 144296 1.0 16550 Name: HeartDiseaseorAttack, dtype: int64
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)
models={
'LR':LogisticRegression(),
'KNN':KNeighborsClassifier(),
'DT':DecisionTreeClassifier(),
'SVC':SVC(),
'NB':GaussianNB(),
'RF':RandomForestClassifier()
}
for name,model in models.items():
print(f'using {name}: ')
model.fit(x_train,y_train)
y_pred=model.predict(x_test)
print(f'Training Accuracy :{accuracy_score(y_train,model.predict(x_train))}')
print(f'Testing Accuracy :{accuracy_score(y_test,y_pred)}')
print(f'Confusion matrix:\n {confusion_matrix(y_test,y_pred)}')
print(f'Recall: {recall_score(y_test,y_pred)}')
print(f'precision: {precision_score(y_test,y_pred)}')
print(f'F1-score: {f1_score(y_test,y_pred)}')
print(f'Fbeta-score: {fbeta_score(y_test,y_pred,beta=0.5)}')
print(classification_report(y_test,y_pred))
print('-'*33)
using LR:
Training Accuracy :0.8988535617920247
Testing Accuracy :0.8987161819104954
Confusion matrix:
[[61081 687]
[ 6295 872]]
Recall: 0.12166875959257709
precision: 0.5593329057087877
F1-score: 0.19986247994499198
Fbeta-score: 0.3253003059016637
precision recall f1-score support
0.0 0.91 0.99 0.95 61768
1.0 0.56 0.12 0.20 7167
accuracy 0.90 68935
macro avg 0.73 0.56 0.57 68935
weighted avg 0.87 0.90 0.87 68935
---------------------------------
using KNN:
Training Accuracy :0.9113748554517986
Testing Accuracy :0.8853267570900123
Confusion matrix:
[[59904 1864]
[ 6041 1126]]
Recall: 0.157108971675736
precision: 0.3765886287625418
F1-score: 0.22171901151914933
Fbeta-score: 0.29434830344539137
precision recall f1-score support
0.0 0.91 0.97 0.94 61768
1.0 0.38 0.16 0.22 7167
accuracy 0.89 68935
macro avg 0.64 0.56 0.58 68935
weighted avg 0.85 0.89 0.86 68935
---------------------------------
using DT:
Training Accuracy :0.9861482411747883
Testing Accuracy :0.844215565387684
Confusion matrix:
[[56364 5404]
[ 5335 1832]]
Recall: 0.25561601785963445
precision: 0.2531785516860144
F1-score: 0.25439144622648063
Fbeta-score: 0.2536623189609814
precision recall f1-score support
0.0 0.91 0.91 0.91 61768
1.0 0.25 0.26 0.25 7167
accuracy 0.84 68935
macro avg 0.58 0.58 0.58 68935
weighted avg 0.84 0.84 0.84 68935
---------------------------------
using SVC:
Training Accuracy :0.9011041617447745
Testing Accuracy :0.8979183288605208
Confusion matrix:
[[61510 258]
[ 6779 388]]
Recall: 0.05413701688293568
precision: 0.6006191950464397
F1-score: 0.09932164341482146
Fbeta-score: 0.19895395344067274
precision recall f1-score support
0.0 0.90 1.00 0.95 61768
1.0 0.60 0.05 0.10 7167
accuracy 0.90 68935
macro avg 0.75 0.52 0.52 68935
weighted avg 0.87 0.90 0.86 68935
---------------------------------
using NB:
Training Accuracy :0.8110739465078398
Testing Accuracy :0.81231594980779
Confusion matrix:
[[52141 9627]
[ 3311 3856]]
Recall: 0.5380214873726803
precision: 0.28598976488911965
F1-score: 0.3734624697336562
Fbeta-score: 0.3155534460465801
precision recall f1-score support
0.0 0.94 0.84 0.89 61768
1.0 0.29 0.54 0.37 7167
accuracy 0.81 68935
macro avg 0.61 0.69 0.63 68935
weighted avg 0.87 0.81 0.84 68935
---------------------------------
using RF:
Training Accuracy :0.9860860699053753
Testing Accuracy :0.8861246101399869
Confusion matrix:
[[60097 1671]
[ 6179 988]]
Recall: 0.1378540532998465
precision: 0.37156825874388866
F1-score: 0.20109912477101569
Fbeta-score: 0.2774813233724653
precision recall f1-score support
0.0 0.91 0.97 0.94 61768
1.0 0.37 0.14 0.20 7167
accuracy 0.89 68935
macro avg 0.64 0.56 0.57 68935
weighted avg 0.85 0.89 0.86 68935
---------------------------------
Histograms: To visualize the distribution of continuous features such as age. Box plots: To visualize the distribution and spread of continuous features and detect outliers. Bar plots: To visualize the frequency distribution of categorical features such as education. Heatmaps: To visualize the correlation matrix between different features. Scatter plots: To observe relationships between variables and uses dots to represent the relationship between them. countplot: To method is used to Show the counts of observations in each categorical bin using bars.